data <- read.csv("/Users/macbookpro/Desktop/Crow/data/wine_df.csv")
data$X <- NULL
head(data)
## alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids
## 1 14.23 1.71 2.43 15.6 127 2.80 3.06
## 2 13.20 1.78 2.14 11.2 100 2.65 2.76
## 3 13.16 2.36 2.67 18.6 101 2.80 3.24
## 4 14.37 1.95 2.50 16.8 113 3.85 3.49
## 5 13.24 2.59 2.87 21.0 118 2.80 2.69
## 6 14.20 1.76 2.45 15.2 112 3.27 3.39
## nonflavanoid_phenols proanthocyanins color_intensity hue
## 1 0.28 2.29 5.64 1.04
## 2 0.26 1.28 4.38 1.05
## 3 0.30 2.81 5.68 1.03
## 4 0.24 2.18 7.80 0.86
## 5 0.39 1.82 4.32 1.04
## 6 0.34 1.97 6.75 1.05
## od280.od315_of_diluted_wines proline
## 1 3.92 1065
## 2 3.40 1050
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
summary(data)
## alcohol malic_acid ash alcalinity_of_ash
## Min. :11.03 Min. :0.740 Min. :1.360 Min. :10.60
## 1st Qu.:12.36 1st Qu.:1.603 1st Qu.:2.210 1st Qu.:17.20
## Median :13.05 Median :1.865 Median :2.360 Median :19.50
## Mean :13.00 Mean :2.336 Mean :2.367 Mean :19.49
## 3rd Qu.:13.68 3rd Qu.:3.083 3rd Qu.:2.558 3rd Qu.:21.50
## Max. :14.83 Max. :5.800 Max. :3.230 Max. :30.00
## magnesium total_phenols flavanoids nonflavanoid_phenols
## Min. : 70.00 Min. :0.980 Min. :0.340 Min. :0.1300
## 1st Qu.: 88.00 1st Qu.:1.742 1st Qu.:1.205 1st Qu.:0.2700
## Median : 98.00 Median :2.355 Median :2.135 Median :0.3400
## Mean : 99.74 Mean :2.295 Mean :2.029 Mean :0.3619
## 3rd Qu.:107.00 3rd Qu.:2.800 3rd Qu.:2.875 3rd Qu.:0.4375
## Max. :162.00 Max. :3.880 Max. :5.080 Max. :0.6600
## proanthocyanins color_intensity hue od280.od315_of_diluted_wines
## Min. :0.410 Min. : 1.280 Min. :0.4800 Min. :1.270
## 1st Qu.:1.250 1st Qu.: 3.220 1st Qu.:0.7825 1st Qu.:1.938
## Median :1.555 Median : 4.690 Median :0.9650 Median :2.780
## Mean :1.591 Mean : 5.058 Mean :0.9574 Mean :2.612
## 3rd Qu.:1.950 3rd Qu.: 6.200 3rd Qu.:1.1200 3rd Qu.:3.170
## Max. :3.580 Max. :13.000 Max. :1.7100 Max. :4.000
## proline
## Min. : 278.0
## 1st Qu.: 500.5
## Median : 673.5
## Mean : 746.9
## 3rd Qu.: 985.0
## Max. :1680.0
Features are in much different scales. We have to scale the data before proceeding to clustering.
# Scale data.
data_scaled <- scale(data)
# Calculate PCA.
pca <- data.frame(prcomp(data_scaled)$x)
Let’s run repeated stochastic clustering using kmeans.
We will run the algorithm 30 times, check it’s clustering stability over
repeated iterations, and get the majority voting label for each
sample.
Will set the number of clusters to 5 for this first experiment.
scr <- cRowflow::stochastic_clustering_runner(
data_scaled,
kmeans,
labels_name = "cluster",
n_runs = 30,
centers=5
)
Let’s visualise the majority voting labels returned by
cRowflow.
create_3d_plot_pca(pca, scr$majority_voting_labels, T, title_addition = " - Clusters")
Let’s now examine the stability of each element over the repeated iterations.
summary(scr$ecc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3384 0.5060 0.6782 0.6455 0.7779 0.8668
The mean ECC is 0.65 and median is 0.68.
create_3d_plot_pca(pca, scr$ecc, title_addition = " - ECC")
The most unstable points are the ones at the boundaries between clusters 2, 3, and 5.
We can use cRowflow’s parameter_searcher
function to find optimal clustering parameter values that result in
reproducible and robust clustering results.
param_grid = list(centers=seq(3,8), algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"))
parameter_searcher <- cRowflow::parameter_searcher(
data_scaled,
kmeans,
labels_name = "cluster",
param_grid = param_grid,
n_runs = 30,
iter.max = 30
)
Let’s visualise the results when changing the number of clusters and the algorithm implementation.
cRowflow::plot_heatmap(parameter_searcher$results_df, "centers", "algorithm")
## Selected keys for visualization: centers, algorithm
## Creating DataFrame from parameter search results...
## No duplicates found. Proceeding without aggregation.
The heatmap displays the median ECC values for different combinations of
k-means clustering algorithms and the number of cluster centers. The
results indicate that lower center values (e.g., 3, 4) yield higher ECC
values, with the Hartigan-Wong implementation producing more stable
results in all cases. In contrast, higher center values (e.g., 8) lead
to lower ECC scores across all algorithms.
Let’s further optimize clustering stability when the number of
clusters (centers) is set to 4 and the implementation is
Hartigan-Wong by identifying the optimal feature subset using a genetic
algorithm. The current median ECC for that configuration is 0.79.
set.seed(42)
genetic_fs <- cRowflow::genetic_algorithm_feature_selector(
data_scaled,
kmeans,
labels_name = "cluster",
verbose = T,
n_generations_no_change = 5,
centers=4,
algorithm = "Hartigan-Wong",
iter.max=30
)
## Gen 0 - Best ECC: 0.9567
## Gen 1 - Best ECC: 0.9567
## Gen 2 - Best ECC: 0.9678
## Gen 3 - Best ECC: 0.9678
## Gen 4 - Best ECC: 0.9678
## Gen 5 - Best ECC: 0.9678
## Gen 6 - Best ECC: 0.9678
## Gen 7 - Best ECC: 0.9678
cRowflow::plot_ga_fitness_evolution(genetic_fs$history)
genetic_fs_best_fitness <- genetic_fs$best_fitness_scr_result
create_3d_plot_pca(pca, genetic_fs_best_fitness$ecc, title_addition = " - ECC AFTER GA OPTIMIZATION")
create_3d_plot_pca(pca, genetic_fs_best_fitness$majority_voting_labels, categorical = T, title_addition = " - CLUSTERS")
By keeping only 7 of the features we were able to produce a much more stable clustering with the same configurations (centers=4, algorithm = “Hartigan-Wong”, iter.max=30).
colnames(data_scaled[,genetic_fs$best_features])
## [1] "alcohol" "ash" "alcalinity_of_ash"
## [4] "total_phenols" "flavanoids" "hue"
## [7] "proline"
We can